1 Fig3

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

1.0.0.1 =================

1.1 Del load ratio

choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"

### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz") 
## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>% 
  select(Taxa=VcfID,GenomeType,GroupbyContinent)
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl  (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
factorA <- c("AT","WE","DE","FTT","LR_EU","CL","LR_EA")
factorB <- c("Ratio_002_nonsynonymous","Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5","Ratio_007_VEP")
labelsB <- c("Nonsynonymous","Deleterious","High impact")

df <- dfdel %>% left_join(.,dftaxaDB,by="Taxa") %>% 
  filter(DelCountGroup == "TotalDerivedSNPCount") %>% 
  select(-c("IfSelected","Group_refobj","DelCountGroup")) %>% 
  filter(GroupbyContinent %in% factorA) %>% 
  filter(Stand_LoadGroup %in% factorB) %>% 
  droplevels() %>% 
  mutate(GroupbyContinent= factor(GroupbyContinent, levels = factorA)) %>% 
  mutate(Stand_LoadGroup=factor(Stand_LoadGroup,levels = factorB, labels = labelsB))

colB <- c("#87cef9","#ffd702","#7f5701","#016699", "#fc6e6e","#9900ff","gray80")

1.1.1 plot

p <- df %>%
  ggplot(aes(x=Subgenome,y=Stand_delCount,fill=GroupbyContinent))+
  ylab("Mutation burden")+xlab("")+
  ### volin
  geom_violin(position = position_dodge(0.8),alpha=0.8,width=2) +
  geom_boxplot(position = position_dodge(0.8), width=0.01, color="white", alpha=0.2,outlier.color = NA) +
  # geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 0.8),aes(color=GroupbyContinent),alpha=0.3)+
  stat_summary(fun=mean, geom="point", color="red",position = position_dodge(0.8),size=0.5)+

  ### boxplot
  # geom_boxplot(position = position_dodge(0.8),outlier.color = NA, alpha = 0.8) +
  # geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 0.8),aes(color=GroupbyContinent),alpha=0.3)+
  # stat_summary(fun=mean, geom="point", color="red",position = position_dodge(0.8),size=0.5)+
  
  ### pointrange
  # stat_summary(aes(color=GroupbyContinent),fun.data=mean_sdl,position = position_dodge(0.8),geom="pointrange",size=0.3)+
  
  scale_color_manual(values = colB,name='')+
  scale_fill_manual(values = colB,name='')+
  facet_grid(Stand_LoadGroup~.,scales = "free")+
  # facet_wrap(Stand_LoadGroup~.,scales = "free", ncol = 1)+
  # ggtitle("Del total count under all genes in vmap 2.1-2021") +
  theme(strip.background = element_blank())+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.4)
    ,legend.position = 'none'
    ,text = element_text(size = 9))
p
## Warning: position_dodge requires non-overlapping x intervals

## Warning: position_dodge requires non-overlapping x intervals

## Warning: position_dodge requires non-overlapping x intervals

# ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)
ggsave(p,filename = "figs/Fig3/misc/whole_load.pdf",height = 3,width = 4)
## Warning: position_dodge requires non-overlapping x intervals

## Warning: position_dodge requires non-overlapping x intervals

## Warning: position_dodge requires non-overlapping x intervals

1.1.2 significance test - slightly

  • 在分析显著性差异前,先将数据框赋予一个新的变量,存储下来。
dfdf <- df
  • 先计算平均值 标准差之类的
### 平均数,标准差,方差等统计
  
df <- df %>% 
  filter(Stand_LoadGroup == "Nonsynonymous") %>% 
  mutate(GroupbyContinent = factor(GroupbyContinent)) %>% 
  group_by(Subgenome,GroupbyContinent) %>% 
summarise(n=n(),mean=mean(Stand_delCount),median=median(Stand_delCount),max=max(Stand_delCount),min=min(Stand_delCount),sd=sd(Stand_delCount), se = sd/sqrt(n))
## `summarise()` has grouped output by 'Subgenome'. You can override using the `.groups` argument.
df
## # A tibble: 16 × 9
## # Groups:   Subgenome [3]
##    Subgenome GroupbyContinent     n  mean median   max   min      sd       se
##    <chr>     <fct>            <int> <dbl>  <dbl> <dbl> <dbl>   <dbl>    <dbl>
##  1 A         WE                  91 0.824  0.825 0.845 0.789 0.0105  0.00110 
##  2 A         DE                  52 0.829  0.830 0.838 0.811 0.00577 0.000801
##  3 A         FTT                 50 0.825  0.827 0.839 0.801 0.00968 0.00137 
##  4 A         LR_EU               58 0.829  0.828 0.846 0.821 0.00498 0.000654
##  5 A         CL                 223 0.829  0.829 0.843 0.807 0.00438 0.000293
##  6 A         LR_EA              127 0.829  0.829 0.840 0.820 0.00395 0.000351
##  7 B         WE                  91 0.747  0.749 0.770 0.718 0.00966 0.00101 
##  8 B         DE                  52 0.751  0.751 0.765 0.738 0.00427 0.000591
##  9 B         FTT                 50 0.753  0.752 0.768 0.736 0.00634 0.000896
## 10 B         LR_EU               58 0.746  0.745 0.753 0.737 0.00367 0.000482
## 11 B         CL                 223 0.750  0.748 0.772 0.738 0.00589 0.000394
## 12 B         LR_EA              127 0.744  0.744 0.760 0.734 0.00392 0.000348
## 13 D         AT                  36 0.790  0.791 0.808 0.764 0.00934 0.00156 
## 14 D         LR_EU               58 0.811  0.811 0.822 0.806 0.00329 0.000432
## 15 D         CL                 223 0.812  0.811 0.823 0.805 0.00309 0.000207
## 16 D         LR_EA              127 0.812  0.811 0.828 0.804 0.00313 0.000277
write.table(df,file="/Users/Aoyue/Documents/value.txt",quote=F,col.name=T,row.names=F,sep = "\t") 

1.2 Del count

### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delCount.txt.gz") 
## Rows: 95766 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): Taxa, Subgenome, VariantsGroup, IfSelected, Group_refobj
## dbl (4): TotalDerivedSNPCount, HomoDerivedSNPCount, HeterDerivedSNPCount, Si...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>% 
  select(Taxa=VcfID,GenomeType,GroupbyContinent)
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl  (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
factorA <- c("AT","WE","DE","FTT","LR_EU","CL","LR_EA")
factorB <- c("002_nonsynonymous","031_GERP16way_1.5andDerived_PolyPhen2_0.5","007_VEP")
labelsB <- c("Nonsynonymous","Deleterious","High impact")

df <- dfdel %>% left_join(.,dftaxaDB,by="Taxa") %>% 
  select(-c("IfSelected","Group_refobj","SiteCountWithMinDepth")) %>% 
  filter(GroupbyContinent %in% factorA) %>% 
  filter(VariantsGroup %in% factorB) %>% 
  droplevels() %>% 
  mutate(GroupbyContinent= factor(GroupbyContinent, levels = factorA)) %>% 
  mutate(VariantsGroup=factor(VariantsGroup,levels = factorB, labels = labelsB))

colB <- c("#87cef9","#ffd702","#7f5701","#016699", "#fc6e6e","#9900ff","gray70")

1.2.1 plot

p <- df %>%
  ggplot(aes(x=Subgenome,y=TotalDerivedSNPCount,fill=GroupbyContinent))+
  labs(y="No. derived SNPs",x="")+
  ### volin
  # geom_violin(position = position_dodge(0.8),alpha=0.8,width=2) +
  # geom_boxplot(position = position_dodge(0.8), width=0.01, color="white", alpha=0.2,outlier.color = NA) +
  # # geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 0.8),aes(color=GroupbyContinent),alpha=0.3)+
  # stat_summary(fun=mean, geom="point", color="red",position = position_dodge(0.8),size=0.5)+

  ### boxplot
  # geom_boxplot(position = position_dodge(0.8),outlier.color = NA, alpha = 0.8) +
  # geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 0.8),aes(color=GroupbyContinent),alpha=0.3)+
  # stat_summary(fun=mean, geom="point", color="red",position = position_dodge(0.8),size=0.5)+
  
  ### pointrange
  stat_summary(aes(color=GroupbyContinent),fun.data=mean_sdl,
               fun.args = list(mult = 1),
               position = position_dodge(0.8),geom="pointrange")+
  stat_summary(fun=mean, geom="point", color="white",position = position_dodge(0.8))+
  
  scale_color_manual(values = colB,name='')+
  scale_fill_manual(values = colB,name='')+
  facet_grid(VariantsGroup~.,scales = "free")+
  # facet_wrap(Stand_LoadGroup~.,scales = "free", ncol = 1)+
  # ggtitle("Del total count under all genes in vmap 2.1-2021") +
  theme(strip.background = element_blank())+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.4)
    ,legend.position = 'none'
    ,text = element_text(size = 9))
p

ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)
# ggsave(p,filename = "figs/Fig3/misc/whole_load_count.pdf",height = 3,width = 4)
### 写一个小函数,专门计算升高降低百分比
aochangePercent <- function(before, after){
  out <-  (after-before)/before
  print(paste("before",before,"\n","after",after,"out",out))
    return(out)
}

## we - de
print("we -de")
## [1] "we -de"
aochangePercent(df[1,4],df[2,4])
## [1] "before 8966.5 \n after 10039 out 0.119611888696816"
##   TotalDerivedSNPCount
## 1            0.1196119
aochangePercent(df[7,4],df[8,4])
## [1] "before 9958.5 \n after 11256.5 out 0.130340914796405"
##   TotalDerivedSNPCount
## 1            0.1303409
print("******************************")
## [1] "******************************"
## de-ftt
print("de -ftt")
## [1] "de -ftt"
aochangePercent(df[2,4],df[3,4])
## [1] "before 10039 \n after 9872.5 out -0.0165853172626756"
##   TotalDerivedSNPCount
## 1          -0.01658532
aochangePercent(df[8,4],df[9,4])
## [1] "before 11256.5 \n after 9683 out -0.139785901479145"
##   TotalDerivedSNPCount
## 1           -0.1397859
print("******************************")
## [1] "******************************"
## ftt-lr 
print("ftt - lr")
## [1] "ftt - lr"
aochangePercent(df[3,4],df[4,4])
## [1] "before 9872.5 \n after 11204 out 0.134869587237275"
##   TotalDerivedSNPCount
## 1            0.1348696
aochangePercent(df[9,4],df[10,4])
## [1] "before 9683 \n after 10754.5 out 0.110657853970877"
##   TotalDerivedSNPCount
## 1            0.1106579
print("******************************")
## [1] "******************************"
## ae-lr cl
print("ae - lr cl")
## [1] "ae - lr cl"
aochangePercent(df[13,4],df[14,4])
## [1] "before 10619.5 \n after 11844 out 0.115306747021988"
##   TotalDerivedSNPCount
## 1            0.1153067
aochangePercent(df[13,4],df[15,4])
## [1] "before 10619.5 \n after 10357.5 out -0.0246715947078488"
##   TotalDerivedSNPCount
## 1          -0.02467159
print("******************************")
## [1] "******************************"
## lr - cl
print("lr - cl")
## [1] "lr - cl"
aochangePercent(df[4,4],df[5,4])
## [1] "before 11204 \n after 9292 out -0.170653338093538"
##   TotalDerivedSNPCount
## 1           -0.1706533
aochangePercent(df[10,4],df[11,4])
## [1] "before 10754.5 \n after 10329.5 out -0.0395183411595146"
##   TotalDerivedSNPCount
## 1          -0.03951834
aochangePercent(df[14,4],df[15,4])
## [1] "before 11844 \n after 10357.5 out -0.125506585612969"
##   TotalDerivedSNPCount
## 1           -0.1255066
print("******************************")
## [1] "******************************"

1.2.1.1 =============

1.3 XPCLR load ratio

dfdelRatio <- read.delim("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/004_summary_XPCLR_top005/012_merge/delRatio.txt")

### 导入种质数据库,并选取特定的列
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
  select(Taxa=VcfID,GenomeType,GroupbyContinent,IfOnVmap2_S643)
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl  (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- dfdelRatio %>% left_join(.,dftaxaDB,by="Taxa")
df$GroupbyContinent <- factor(df$GroupbyContinent, levels = c("WE","DE","FTT","LR_EU","CL"))
colB <- c("#ffd702","#7f5701","#016699", "#fc6e6e","#9900ff")

df$Stand_LoadGroup <- factor(df$Stand_LoadGroup,levels = c("Ratio_002_nonsynonymous","Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5","Ratio_007_VEP"), labels = c("Nonsynonymous","Deleterious","High impact"))

df$Group_refobj <- factor(df$Group_refobj, levels = c("wede","dedurum","lrcul"))

p <- df %>% 
  filter(IfSelected == 1) %>% 
  filter(DelCountGroup == "TotalDerivedSNPCount") %>% 
  ggplot(aes(x=Subgenome,y=Stand_delCount))+
  ylab("Mutation burden")+xlab("")+
  ### pointrange
  stat_summary(aes(color=GroupbyContinent),fun.data=mean_sdl,
               fun.args = list(mult = 1),
               position=position_dodge(0.8),geom="pointrange")+
  stat_summary(aes(group = GroupbyContinent), fun=mean, geom="point", color="white",position = position_dodge(0.8))+
  scale_color_manual(values = colB,name='')+
  scale_fill_manual(values = colB,name='')+
  facet_grid(Stand_LoadGroup~Group_refobj,scales = "free")+
  theme(strip.background = element_blank())+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7)
    ,legend.position = 'none'
    ,text = element_text(size = 12))
p

# ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)
ggsave(p,filename = "~/Documents/test.pdf",height =6,width = 6)

1.4 XPCLR load count

dfdelcount <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/004_summary_XPCLR_top005/012_merge/delCount_xpclr.txt")
## Rows: 3732 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Taxa, Subgenome, VariantsGroup, Group_refobj
## dbl (5): IfSelected, TotalDerivedSNPCount, HomoDerivedSNPCount, HeterDerived...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 导入种质数据库,并选取特定的列
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
  select(Taxa=VcfID,GenomeType,GroupbyContinent)
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl  (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
factorA <- c("002_nonsynonymous","031_GERP16way_1.5andDerived_PolyPhen2_0.5","007_VEP")
labelsA <- c("Nonsynonymous","Deleterious","High impact")

df <- dfdelcount %>% left_join(.,dftaxaDB,by="Taxa") %>% 
  filter(VariantsGroup %in% factorA)

df$GroupbyContinent <- factor(df$GroupbyContinent, levels = c("WE","DE","FTT","LR_EU","CL"))
colB <- c("#ffd702","#7f5701","#016699", "#fc6e6e","#9900ff")
df$VariantsGroup <- factor(df$VariantsGroup,levels = factorA, labels = labelsA)
df$Group_refobj <- factor(df$Group_refobj, levels = c("wede","dedurum","lrcul"))

p <- df %>% 
  filter(IfSelected == 1) %>% 
  ggplot(aes(x=Subgenome,y=TotalDerivedSNPCount))+
  ylab("Derived allele count per accession")+xlab("")+
  ### pointrange
  stat_summary(aes(color=GroupbyContinent),fun.data=mean_sdl,
               fun.args = list(mult = 1),size=0.4,
               position=position_dodge(0.95),geom="pointrange")+
  stat_summary(aes(group = GroupbyContinent), fun=mean, geom="point", color="white",
               size=0.9,
               position = position_dodge(0.95))+
  scale_color_manual(values = colB,name='')+
  scale_fill_manual(values = colB,name='')+
  facet_grid(VariantsGroup~Group_refobj,scales = "free")+
  theme(strip.background = element_blank(),
        strip.text = element_text(size=6))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.5)
    ,legend.position = 'none'
    ,text = element_text(size = 9))
p

# ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)
# ggsave(p,filename = "~/Documents/test.pdf",height = 2.4,width = 2.4)
ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 3)

1.4.0.1 ==============

1.5 DelLoad only Hexaploid

choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"

### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz") 
## Rows: 371456 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Taxa, Subgenome, IfSelected, Group_refobj, DelCountGroup, Stand_Loa...
## dbl (1): Stand_delCount
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>% 
  select(Taxa=VcfID,GenomeType,GroupbyContinent)
## Rows: 1062 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (33): VcfID, Group, Taxa, Accessions_byLuLab, GenomeType, Genus, Species...
## dbl  (5): Longitude, Latitude, Altitude, ReleasedIntroducedYear, IfOnVmap2_S643
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
factorA <- c("LR_EU","CL","LR_EA")

# factorB <- c("Ratio_002_nonsynonymous","Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5","Ratio_007_VEP")
# labelsB <- c("Nonsynonymous","Deleterious","High impact")

factorB <- c("Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5")
labelsB <- c("Deleterious")

df <- dfdel %>% left_join(.,dftaxaDB,by="Taxa") %>% 
  filter(DelCountGroup == "TotalDerivedSNPCount") %>% 
  select(-c("IfSelected","Group_refobj","DelCountGroup")) %>% 
  filter(GroupbyContinent %in% factorA) %>% 
  filter(Stand_LoadGroup %in% factorB) %>% 
  droplevels() %>% 
  mutate(GroupbyContinent= factor(GroupbyContinent, levels = factorA)) %>% 
  mutate(Stand_LoadGroup=factor(Stand_LoadGroup,levels = factorB, labels = labelsB))

colB <- c( "#fc6e6e","#9900ff","gray90")

1.5.1 plot

p <- df %>%
  ggplot(aes(x=Subgenome,y=Stand_delCount,fill=GroupbyContinent))+
  ylab("Mutation burden")+xlab("")+
  ### volin
  # geom_violin(position = position_dodge(0.8),alpha=0.8,width=2) +
  # geom_boxplot(position = position_dodge(0.8), width=0.02, color="white", alpha=0.2) +
  # geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 0.8),aes(color=GroupbyContinent),alpha=0.3)+
  # stat_summary(fun=mean, geom="point", color="red",position = position_dodge(0.8),size=0.5)+

  ### boxplot
  geom_boxplot(position = position_dodge(0.8),outlier.color = NA, alpha = 0.8) +
  geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 0.8),aes(color=GroupbyContinent),alpha=0.3)+
  stat_summary(fun=mean, geom="point", color="red",position = position_dodge(0.8),size=0.5)+
  
  ### pointrange
  # stat_summary(aes(color=GroupbyContinent),fun.data=mean_sdl,position = position_dodge(0.8),geom="pointrange",size=0.3)+
  
  scale_color_manual(values = colB,name='')+
  scale_fill_manual(values = colB,name='')+
  facet_wrap(Stand_LoadGroup~.,scales = "free", ncol = 1)+
  # ggtitle("Del total count under all genes in vmap 2.1-2021") +
  theme(strip.background = element_blank())+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7)
    ,legend.position = 'none'
    ,text = element_text(size = 12))
p

# ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)
ggsave(p,filename = "figs/Fig3/misc/whole_load.pdf",height = 3,width = 4)

1.5.1.1 ===============

1.6 Introgression

1.6.0.1 ==============

1.7 Data important

# table <- read_tsv("data/fromDaxing/fd/006_introgression_burden/001_popFdIntervalBurden/introgressionDonorBurden.txt.gz") %>% 
#   mutate(Sub=substring(Chr, 2,2)) %>% 
#   mutate(P2=factor(P2,levels = c("LR_AM","LR_AF","LR_CSA","LR_WA","LR_EU","CL","LR_EA"))) %>% 
#   mutate(P3=factor(P3,levels = c("WE","DE","FTT","AT"),labels = c("WE","DE","FTT","AT"))) 
# 
# df <- table %>% 
#   group_by(Chr,Start,End,P3) %>%
#   summarise(across(contains("introgressed"),.fns = sum, na.rm = T))
# 
# df1 <- df %>% 
#   mutate(IntrogressedSynTotalCount=ifelse(NumAccessionIntrogressed==0,NA,IntrogressedSynTotalCount),
#          IntrogressedNonTotalCount=ifelse(NumAccessionIntrogressed==0,NA,IntrogressedNonTotalCount),
#          IntrogressedDelTotalCount=ifelse(NumAccessionIntrogressed==0,NA,IntrogressedDelTotalCount)) %>% 
#   mutate(NonIntrogressedSynTotalCount=ifelse(NumAccessionNonIntrogressed==0,NA,NonIntrogressedSynTotalCount),
#          NonIntrogressedNonTotalCount=ifelse(NumAccessionNonIntrogressed==0,NA,NonIntrogressedNonTotalCount),
#          NonIntrogressedDelTotalCount=ifelse(NumAccessionNonIntrogressed==0,NA,NonIntrogressedDelTotalCount))
# 
# write_tsv(df1,"/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/RProject/R4VMap2FigsS1062/data/fromDaxing/fd/006_introgression_burden/001_popFdIntervalBurden/002_introgressionDonorBurden.txt.gz")



table <- read_tsv("data/fromDaxing/fd/006_introgression_burden/001_popFdIntervalBurden/002_introgressionDonorBurden.txt.gz") %>% 
  mutate(Sub=str_sub(Chr,2,2)) %>% 
  mutate(Sub=factor(Sub, levels = c("A","B","D")), SubID=str_sub(Chr,1,1)) %>% 
  mutate(IntrogressedWeaklyLoad=(IntrogressedNonTotalCount+IntrogressedDelTotalCount)/IntrogressedSynTotalCount) %>%
  mutate(IntrogressedStronglyLoad=(IntrogressedDelTotalCount)/IntrogressedSynTotalCount) %>% 
  mutate(NonIntrogressedWeaklyLoad=(NonIntrogressedNonTotalCount+NonIntrogressedDelTotalCount)/ NonIntrogressedSynTotalCount) %>% 
  mutate(NonIntrogressedStronglyLoad=(NonIntrogressedDelTotalCount)/NonIntrogressedSynTotalCount)
## Rows: 269106 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): Chr, P3
## dbl (10): Start, End, NumAccessionIntrogressed, NumAccessionNonIntrogressed,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### 结果里有 NA  NaN Inf
dfMaxMin <- table %>% 
  pivot_longer(cols = IntrogressedWeaklyLoad:NonIntrogressedStronglyLoad,names_to = "Group",values_to = "Load") %>% 
  filter(!is.infinite(Load), !is.na(Load)) %>% 
  group_by(Sub,P3,Group) %>% 
  summarise(max=max(Load))
## `summarise()` has grouped output by 'Sub', 'P3'. You can override using the `.groups` argument.
df <- table %>% 
  pivot_longer(cols = IntrogressedWeaklyLoad:NonIntrogressedStronglyLoad,names_to = "Group",values_to = "Load") %>% 
  left_join(y = dfMaxMin,by = c("Sub","P3","Group")) %>% 
  mutate(Load=ifelse(Load>max, max, Load)) %>%  #### 对缺失值的处理
  select(-c("max")) %>% 
  pivot_wider(names_from = "Group", values_from = "Load") %>% 
  ## Normalization
  mutate(WeaklyLoadNew2=(IntrogressedWeaklyLoad-NonIntrogressedWeaklyLoad)/(IntrogressedWeaklyLoad+NonIntrogressedWeaklyLoad),
         StronglyLoadNew2=(IntrogressedStronglyLoad-NonIntrogressedStronglyLoad)/(IntrogressedStronglyLoad+NonIntrogressedStronglyLoad)) %>% 
  pivot_longer(cols = c(WeaklyLoadNew2,StronglyLoadNew2),names_to = "SlightlyOrStrongly",values_to = "IntrogressionBurdenIndex") %>% 
  mutate(SlightlyOrStrongly=factor(SlightlyOrStrongly, levels = c("WeaklyLoadNew2","StronglyLoadNew2"), labels = c("Nonsynonymous","Deleterious")))

df$P3 <- factor(df$P3,levels = c("AT","WE","DE","FTT"))
colB <- c("#87cef9","#ffd702","#7f5701","#016699")

1.8 load ratio (Introgressed/Nonintrogressed) IntrogressionBurdenIndex

p <- df %>% 
  ggplot(aes(x=Sub, y=IntrogressionBurdenIndex, fill=P3))+
  # geom_boxplot()+
  # geom_lv(position = position_dodge(0.8),width.method = "height",color="black",conf=0.9,k=4)+
  # stat_summary(geom = "point",fun = mean,size=1, color="black",
  #              position = position_dodge(0.8))+
  geom_hline(yintercept = 0, linetype=2, color="grey")+
  stat_summary(geom = "pointrange", fun.data = mean_cl_boot, aes(color=P3), 
               position = position_dodge(0.8))+
  labs(y="Introgressed VS\nnonintrogressed burden",x="")+
  # scale_fill_brewer(palette = "Set2")+
  # scale_color_brewer(palette = "Set2")+
  scale_fill_manual(values = colB)+
  scale_color_manual(values = colB)+
  # scale_y_log10(breaks = scales::trans_breaks("log2", function(x) 2^x),
  #               labels = scales::trans_format("log2", scales::math_format(2^.x)))+
  # coord_cartesian(xlim = c(0,1.0))+
  facet_grid(SlightlyOrStrongly~.)+
  theme(
    axis.title.x = element_blank(),
    strip.background = element_blank(),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.5),
    legend.position = 'none',
    text = element_text(size = 9))
p
## Warning: Removed 473029 rows containing non-finite values (stat_summary).

ggsave(p,filename = "~/Documents/test.pdf",height = 2.4,width = 2.4)
## Warning: Removed 473029 rows containing non-finite values (stat_summary).
dftest <- df %>% 
  filter(!is.na(IntrogressionBurdenIndex)) %>% 
  group_by(P3,SlightlyOrStrongly) %>% 
  count()

dftest2 <- df %>% 
  # filter(is.na(IntrogressionBurdenIndex)) %>% 
  group_by(P3,SlightlyOrStrongly) %>% 
  count()

1.9 whole genome distribution

dfall <- df
library(slider)
df <- dfall %>% 
  filter(!is.na(IntrogressionBurdenIndex)) %>% 
  group_by(Chr,P3) %>% 
  mutate(bin=slide(Start,mean,.before = 50,.after = 50) %>% unlist(),
         binBurdenIndex=slide(IntrogressionBurdenIndex, mean,.before = 50,.after = 50) %>% unlist())

#### 设置每条染色体的点
data1 <- data.frame(RefChr = paste(1:7,rep("A",7),sep = ""),x=c(213,327,319,266,254,286,362),y=rep(0,7))
data2 <- data.frame(RefChr = paste(1:7,rep("B",7),sep = ""),x=c(241,348,347,304,201,325,309),y=rep(0,7))
data3 <- data.frame(RefChr = paste(1:7,rep("D",7),sep = ""),x=c(170,268,240,185,187,215,339),y=rep(0,7))
data <- rbind(data1,data2,data3) %>% 
  mutate(Sub=str_sub(RefChr,2,2), SubID=str_sub(RefChr,1,1)) %>% 
  filter(RefChr %in% c("1A","1B","1D"))

p <- df %>%
  group_by(SlightlyOrStrongly) %>% 
  # filter(Chr %in% c("1A","1B","1D")) %>% 
  group_map(~{ggplot(.x,aes(x=bin/1000000,y=binBurdenIndex,color=P3))+
  # geom_point()+
  geom_line(size=0.4)+
  geom_hline(yintercept = 0, linetype=2, color="grey",size=0.4)+
  facet_grid(SubID~Sub)+
  geom_point(aes(x,y),color = "blue",size=0.5,data = data)+
  scale_fill_manual(values = colB)+
  scale_color_manual(values = colB)+
  scale_y_continuous(breaks = seq(-0.3,0.1,by=0.3))+
  # geom_smooth(method = "loess",span=0.2)+
  labs(x="Genomic position (Mb)", y="Introgressed VS 
       nonIntrogressed burden")+
  theme_classic()+
  theme(
        legend.background = element_rect(fill="transparent"),
        strip.background = element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        # axis.title.x = element_blank(),
        axis.line = element_line(size = 0.3),
        text = element_text(size = 9),
        panel.border = element_rect(colour = "transparent",fill = "transparent"))})
p
## [[1]]

## 
## [[2]]

ggsave("~/Documents/temp.pdf",p[[1]],width =4.73 , height = 2.31)


p <- df %>%
  filter(P3=="AT") %>%
  ggplot(aes(x=bin/1000000,y=binBurdenIndex,color=SlightlyOrStrongly))+
  # geom_point()+
  geom_line()+
  geom_point(aes(x,y),color = "blue",size=1,data = data)+
  geom_hline(yintercept = 0, linetype=2, color="grey")+
  facet_grid(SubID~Sub)+
  scale_fill_manual(values = colB)+
  scale_color_manual(values = colB)+
  labs(x="Genomic position (Mb)")+
  # geom_smooth(method = "loess",span=0.2)+
  theme_classic()+
  theme(
        legend.background = element_rect(fill="transparent"),
        strip.background = element_blank(),
        # legend.position = "none", legend.title = element_blank(),
        axis.title.x = element_blank(),
        axis.line = element_blank(),
        # text = element_text(size = 14),
        panel.border = element_rect(colour = "black",fill = "transparent"))

p

ggsave(p,filename = "~/Documents/test.pdf",height = 7.2,width = 7.2)

1.9.0.1 ===============

1.10 Extended Data

1.10.0.1 ===============

1.11 FigS:XPCLR positive control

1.11.1 Important data

dataFile <- c("data/020_xpclr/ab_WE_DE_log2_0.0001_200_100000.clrgs.txt.gz")
df_wede <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

dataFile <- c("data/020_xpclr/ab_DE_Durum_log2_0.0001_200_100000.clrgs.txt.gz")
df_dedm <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

dataFile <- c("data/020_xpclr/abd_log2_0.0001_200_100000.clrgs.txt.gz")
df_lrcl <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

df_wede <- df_wede %>% mutate(group="WE vs DE", subID=substr(ChrRef,1,1), sub=substr(ChrRef,2,2))
df_dedm <- df_dedm %>% mutate(group="DE vs Durum", subID=substr(ChrRef,1,1), sub=substr(ChrRef,2,2))
df_lrcl <- df_lrcl %>% mutate(group="LR vs CL", subID=substr(ChrRef,1,1), sub=substr(ChrRef,2,2))

df <- rbind(df_wede,df_dedm,df_lrcl)
df$group <- factor(df$group, levels = c("WE vs DE","DE vs Durum","LR vs CL"))

1.11.2 Start to plot - 循环

  • 一一验证比较慢,故采用循环的方式,瞬间出来
df_positive <- read_tsv("data/020_xpclr/GeneList_forPositiveControl.txt")
## Rows: 20 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): GeneName, Group, ChrRef, XLab
## dbl (2): GenePos, CentromerePos
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
aoFactor <- c("Btr-A1","Btr-B1","Q","Tg","Rht-B1","Rht-D1","Ppd-D1","Vrn-A1","Vrn-B1","Vrn-D1","Fhb1","Sm1","Lr10","Lr21","Lr34-Yr18-Sr57-Pm38-Ltn1-Sb1")
for (i in aoFactor) {
  dfgeneinfo <- df_positive %>% filter(GeneName==i)
  
  groupA <- dfgeneinfo$Group
  chrRefA <- dfgeneinfo$ChrRef
  genePosA <- dfgeneinfo$GenePos
  centromerePosA <- dfgeneinfo$CentromerePos
  xlabA <- dfgeneinfo$XLab
  
   p <- df %>% 
    filter(ChrRef == chrRefA, group == groupA) %>%
     ggplot(aes(x=PosRef/1000000, y=XPCLR_100score,col=group))+
     geom_line(size=0.5)+
     geom_point(aes(x=centromerePosA,y=0),color='grey')+ ### 标记着丝粒
    # geom_hline(yintercept = 19.48245)+
    ### 标记蓝色点
    geom_point(aes(x=genePosA,y=0),color='blue')+ ###标记基因
    # geom_point(aes(x=gene2,y=0),color='blue')+
    labs(x=xlabA,y="XP-CLR score")+
    facet_wrap(~group,scales = "free_x",ncol = 1)+
    # scale_x_continuous(breaks = seq(0,600,by = 300))+
    # scale_x_continuous(limits = c(0,50), breaks = seq(0,25,by = 5))+
    scale_y_continuous(breaks = seq(0,40,by = 25))+
    # scale_color_manual(values = colB)+
    scale_color_brewer(palette = "Set2")+
    theme_classic()+
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
      ,panel.background = element_blank()
      ,axis.line = element_line(size=0.3, colour = "black")
      ,legend.position = 'none'
      ,text = element_text(size = 12))+
    theme(strip.background = element_rect(fill = "lightgray",color = "white"))

  p
  
  filename <- str_c("~/Documents/",i,".png",sep = "")


# ggsave(p,filename = "~/Documents/test.pdf",height = 2,width = 5)
ggsave(p,filename = filename,height = 2,width = 5) ###由于线条过多,故保存成图片
}

1.11.2.1 ======

1.12 xpclr-whole genome

1.13 ***XPCLR score

1.14 WE vs. DE manhattan plot for add signal dot

################################# ##
#                               #
#  XPCLR genome line -- manhattan 100kb 50 kb 
#                               #
################################# ##
library(tidyverse)
# ### log2 snp method whole-genome SNP 3%
dataFile <- c("data/020_xpclr/ab_WE_DE_log2_0.0001_200_100000.clrgs.txt.gz")
df_wede <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

dataFile <- c("data/020_xpclr/ab_DE_Durum_log2_0.0001_200_100000.clrgs.txt.gz")
df_dedm <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

dataFile <- c("data/020_xpclr/abd_log2_0.0001_200_100000.clrgs.txt.gz")
df_lrcl <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

### ******************************************************************************
### choice: 1.df 2.color 3.threshold  
### ******************************************************************************
### 1. data
df <- df_wede
# df <- df_dedm
# df <- df_lrcl
### 2.color
test <- "wede"
# test <- "dedm"
# test <- "lrcl"
colB <- NULL
labelB <- NULL

# colB1 <- c("#ffd702","#7f5701") ### we de
# colB2 <- c("#7f5701","#016699") ### de durum
# colB3 <- c("#fc6e6e","#9900ff")  ### lr cl
## use subgenome color
colB1 <- c("#fd8582","#967bce") ### we de
colB2 <- c("#fd8582","#967bce") ### de durum
colB3 <- c("#fd8582","#967bce","#4bcdc6")  ### lr cl

### if else failed to use
if(test == "wede"){
  colB <- colB1
  labelB <- "WE vs. DE"
} 
if(test == "dedm"){
  colB <- colB2
  labelB <- "DE vs. Durum"
}
if(test == "lrcl"){
  colB <- colB3
  labelB <- "Landrace vs. Cultivar"
}

### 3. 
# top <- 0.01
top <- 0.05

# head(df)
# nrow(df)
### step2: 修改列名,使之固定
names(df)[names(df)=="XPCLR_100score"] = "Ave"
names(df)[names(df)=="CHROM"] = "Chr"
names(df)[names(df)=="ChrRef"] = "CHROM"
names(df)[names(df)=="PosRef"] = "BIN_START"
max(df$Ave)
## [1] 50
min(df$Ave)
## [1] 0
### step3: 去除NA和无限小的值
df <- dplyr::filter(df,!is.na(Ave) & is.finite(Ave))
nrow(df)
## [1] 99770
head(df)
##   Chr Grid N_SNPs     POS Genetic_pos CHROM BIN_START        Ave
## 1   1    0     29 1146726    0.054851    1A   1146726 2.17052022
## 2   1    1      3 1246726    0.067086    1A   1246726 1.02820104
## 3   1    2      1 1346726    0.068195    1A   1346726 0.04414899
## 4   1    3      0 1446726    0.070346    1A   1446726 0.00000000
## 5   1    4      0 1546726    0.072685    1A   1546726 0.00000000
## 6   1    5      0 1646726    0.075024    1A   1646726 0.00000000
max(df$Ave) # 48.50477
## [1] 50
min(df$Ave) # 0
## [1] 0
### step4: 获取TOP K 截距
### step4a: 排序
df_sorted <- df %>% 
  dplyr::arrange(-Ave) 
head(df_sorted)
##   Chr Grid N_SNPs       POS Genetic_pos CHROM BIN_START Ave
## 1   1 3655    200 366646726    0.718376    1A 366646726  50
## 2   2    0    108     18349    0.694691    1A 471322354  50
## 3   2    1    108    118349    0.694691    1A 471422354  50
## 4   3  648    200  64982586    0.557290    1B  64982586  50
## 5   4  604    200  60427650    0.743404    1B 499147804  50
## 6   7 1331    200 133192290    0.564941    2A 133192290  50
### step4b: 获取top1% 的值,即水平截距
threshod <- df_sorted[top*nrow(df_sorted),8]
threshod
## [1] 25.2817
# write.table(d,file="/Users/Aoyue/Documents/value.txt",quote=F,col.name=T,row.names=F,sep = "\t") 
### step5: 建立累积坐标系统和x轴标签位置
don <- df %>%
  # Compute chromosome size
  group_by(CHROM) %>%
  summarise(chr_len=max(BIN_START)) %>%  ##每条染色体的最大值赋值给 chr_len
  # Calculate cumulative position of each chromosome
  mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>%  ##所有之和减去该行的染色体对应的长度,如:1B起始为1A长度,1D起始为1A 1B之和的长度
  select(-chr_len) %>% ##去除 chr_len这一列
  # Add this info to the initial dataset
  left_join(df, ., by=c("CHROM"="CHROM")) %>%  ## 将原数据框和don数据框进行合并
  # Add a cumulative position of each SNP
  arrange(CHROM, BIN_START) %>% ##根据CHROM BIN_START 进行排序
  mutate( BPcum=BIN_START+tot)  ##增加新的一列,改变每行点的位置

### 记录增量
dfadd <- df %>%
  # Compute chromosome size
  group_by(CHROM) %>%
  summarise(chr_len=max(BIN_START)) %>%  ##每条染色体的最大值赋值给 chr_len
  # Calculate cumulative position of each chromosome
  mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>%  ##所有之和减去该行的染色体对应的长度,如:1B起始为1A长度,1D起始为1A 1B之和的长度
  select(-chr_len)  ##去除 chr_len这一列
# dfadd  

## x轴标签位置:求出每条染色体在坐标轴上的中间位置
axisdf = don %>% group_by(CHROM) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )

### 建立函数关系,添加驯化信号的位点
### 建立函数关系,添加驯化信号的位点
### 建立函数关系,添加驯化信号的位点
geneList <- read.table("data/020_xpclr/gene.txt",sep = "\t",header = T)
geneList
##      Gene Chromosome     Start       End Strand  Length
## 1       Q         5A 650127221 650130900     -1    3679
## 2      Q2         5B 658092362 658095144     -1    2782
## 3      Q3         5D 521712714 521716500     -1    3786
## 4      Tg         2B  36446325  36446572     -1     247
## 5  Btr-A1         3A  65701207  65869644      1  168437
## 6  Btr-B1         3B  85239993  88971836      1 3731843
## 7  Btr-D1         3D  55350411  55779958      1  429547
## 8  Btr-A2         3A  65700170  65829460      1  129290
## 9  Btr-B2         3B  85237203  88202545      1 2965342
## 10 Btr-D2         3D  55349356  56444751      1 1095395
## 11 Vrn-A1         5A 587411824 587423240     -1   11416
## 12 Vrn-B1         5B 573803238 573815903     -1   12665
## 13 Vrn-D1         5D 467176964 467183469     -1    6505
## 14 Vrn-A2         5A 698178738 698180607     -1    1869
## 15 Vrn-B2         4B 657515589 657517678      1    2089
## 16 Vrn-D2         4D 509338928 509340956     -1    2028
## 17 Vrn-A3         7A  71669854  71670886      1    1032
## 18 Vrn-B3         7B   9702478   9703735      1    1257
## 19 Vrn-D3         7D  68416507  68417532     -1    1025
## 20 Ppd-A1         2A  36936874  36938065     -1    1191
## 21 Ppd-D1         2D  33952488  33955629     -1    3141
## 22 Rht-A1         4A 582477716 582479578     -1    1862
## 23 Rht-B1         4B  30861382  30863282      1    1900
## 24 Rht-D1         4D  18781062  18782933      1    1871
geneList$Gene <- as.vector(geneList$Gene)
geneList$Chromosome <- as.vector(geneList$Chromosome)
geneList$Start <- as.numeric(geneList$Start)  ## 为了可以进行预测
geneList$End <- as.numeric(geneList$End)

### 写一个函数,返回基因的标签,以及在图上的range位置
aoGetGenepos <- function(geneRow){
  gene <- geneList[geneRow,] ### get gene's dataframe, only has one line
  labelgene <- gene[1,1]### get gene's label
  chrGene <- as.vector(gene[1,2]) ### get gene's chr
  posSignal <- gene[,3:4] ### get gene's pos, a dataframe
  posOnchr <- c(posSignal[1,1],posSignal[1,2]) ### change to x 
  ### select chr, only one line
  dfadd_one <- dfadd %>% 
    filter(CHROM == chrGene) 
  # dfadd_one
  posSignalCum <- posOnchr + as.numeric(dfadd_one[1,2])
  # geneSummary <- list( labelgene,as.numeric(posSignalCum[1]),as.numeric(posSignalCum[2]))
  geneSummary <- list( labelgene,posSignalCum[1],posSignalCum[2])
  
  return(geneSummary)
}
 
#### btr1 
btra1 <- aoGetGenepos(5)
btrb1 <- aoGetGenepos(6)
btra2 <- aoGetGenepos(8)
btrb2 <- aoGetGenepos(9)

### step6: 颜色设置并画图
### 开始作图

### sample
don <- sample_frac(don,0.5)
p <- ggplot(don, aes(x=BPcum, y=Ave)) +  ##新的横坐标,Y轴的XPCLR值不变
  # Show all points
  # geom_point( aes(color=as.factor(CHROM)), alpha=0.2, size=0.5) +  ## alpha=0.5 first test
  geom_line(aes(color=as.factor(CHROM)), alpha=0.9, size=0.2) +  ## alpha=0.5 first test
  annotate("text",x=Inf,y= threshod ,hjust= 2,vjust= -0.3,label="top 5%", size = 3)+
  geom_hline(yintercept = threshod, linetype= "dashed", color = "gray",size=0.2)+
  annotate("text",x=0,y= Inf ,hjust= -0.2,vjust= 1.1,label= labelB, size = 2)+ ## add group annotation
  scale_color_manual(values = rep(colB, 14 )) + ##合计42条染色体。故21*2=42
  # custom X axis: 每条染色体对应一个中间位置,完美!
  scale_x_continuous(expand = c(0,0), label = axisdf$CHROM, breaks= axisdf$center) +
  scale_y_continuous(expand = c(0,1),limits = c(-6,54)) +
  # scale_y_continuous(expand = c(0, 0),limits = c(-1,52) ) +     # remove space between plot area and x axis
  
  ## add arrow to represent the selection signal 
  # geom_vline(xintercept = unlist(btra1[2]),color="black",size=0.2)+
  # geom_segment(aes(x=unlist(btra1[2]) ,xend = unlist(btra1[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(btra1[2]) ,xend = unlist(btra1[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") + 
  annotate(geom = "text",x = unlist(btra1[3]),y=0.2,label= unlist(btra1[1]),fontface = 'italic',hjust = -0.2,size=1.4,color="blue") +

  # geom_segment(aes(x=unlist(btrb1[2]) ,xend = unlist(btrb1[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(btrb1[2]),xend = unlist(btrb1[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  annotate(geom = "text",x = unlist(btrb1[3]),y=0.2,label=unlist(btrb1[1]),fontface = 'italic',hjust = -0.2,size=1.4,color="blue") +
  
  # annotate(geom = "segment",x=unlist(btra2[2]),xend = unlist(btra2[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  annotate(geom = "text",x = unlist(btra2[3]),y=-4,label=unlist(btra2[1]),fontface = 'italic',hjust = -0.3,size=1.4,color="blue") +
  # geom_vline(xintercept = unlist(btra2[2]),color="red",size=0.5)+
  
  # annotate(geom = "segment",x=unlist(btrb2[2]),xend = unlist(btrb2[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  annotate(geom = "text",x = unlist(btrb2[3]),y=-4,label=unlist(btrb2[1]),fontface = 'italic',hjust = -0.3,size=1.4,color="blue") +
  # geom_vline(xintercept = unlist(btrb2[2]),color="red",size=0.5)+
  
   # Custom the theme:
  xlab("")+ylab("XP-CLR")+
  theme(plot.margin=unit(rep(1,4),'lines'))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(size=0.4, colour = "black"),text = element_text(size = 9),legend.position = 'none')

p

# ggsave(p,filename = "/Users/Aoyue/Documents/test.png",width = 7,height = 7*3/16)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",width = 7,height = 7*3/16)

1.15 DE vs. Durum manhattan plot for add signal dot

################################# ##
#                               #
#  XPCLR genome line -- manhattan 100kb 50 kb 
#                               #
################################# ##
library(tidyverse)

dataFile <- c("data/020_xpclr/ab_WE_DE_log2_0.0001_200_100000.clrgs.txt.gz")
df_wede <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

dataFile <- c("data/020_xpclr/ab_DE_Durum_log2_0.0001_200_100000.clrgs.txt.gz")
df_dedm <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

dataFile <- c("data/020_xpclr/abd_log2_0.0001_200_100000.clrgs.txt.gz")
df_lrcl <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

# df <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

### ******************************************************************************
### choice: 1.df 2.color 3.threshold  
### ******************************************************************************
### 1. data
# df <- df_wede
df <- df_dedm
# df <- df_lrcl
### 2.color
# test <- "wede"
test <- "dedm"
# test <- "lrcl"
colB <- NULL
labelB <- NULL

# colB1 <- c("#ffd702","#7f5701") ### we de
# colB2 <- c("#7f5701","#016699") ### de durum
# colB3 <- c("#fc6e6e","#9900ff")  ### lr cl
## use subgenome color
colB1 <- c("#fd8582","#967bce") ### we de
colB2 <- c("#fd8582","#967bce") ### de durum
colB3 <- c("#fd8582","#967bce","#4bcdc6")  ### lr cl

### if else failed to use
if(test == "wede"){
  colB <- colB1
  labelB <- "WE vs. DE"
} 
if(test == "dedm"){
  colB <- colB2
  labelB <- "DE vs. Durum"
}
if(test == "lrcl"){
  colB <- colB3
  labelB <- "Landrace vs. Cultivar"
}

### 3. 
 # top <- 0.01
top <- 0.05

# head(df)
# nrow(df)
### step2: 修改列名,使之固定
names(df)[names(df)=="XPCLR_100score"] = "Ave"
names(df)[names(df)=="CHROM"] = "Chr"
names(df)[names(df)=="ChrRef"] = "CHROM"
names(df)[names(df)=="PosRef"] = "BIN_START"
max(df$Ave)
## [1] 50
min(df$Ave)
## [1] 0
### step3: 去除NA和无限小的值
df <- dplyr::filter(df,!is.na(Ave) & is.finite(Ave))
nrow(df)
## [1] 99735
head(df)
##   Chr Grid N_SNPs     POS Genetic_pos CHROM BIN_START       Ave
## 1   1    0     18 1146726    0.046440    1A   1146726 0.3066511
## 2   1    1      1 1246726    0.063779    1A   1246726 0.1882992
## 3   1    2      3 1346726    0.064555    1A   1346726 0.1024874
## 4   1    3      0 1446726    0.066497    1A   1446726 0.0000000
## 5   1    4      0 1546726    0.068650    1A   1546726 0.0000000
## 6   1    5      0 1646726    0.070802    1A   1646726 0.0000000
max(df$Ave) # 48.50477
## [1] 50
min(df$Ave) # 0
## [1] 0
### step4: 获取TOP K 截距
### step4a: 排序
df_sorted <- df %>% 
  dplyr::arrange(-Ave) 
head(df_sorted)
##   Chr Grid N_SNPs       POS Genetic_pos CHROM BIN_START Ave
## 1   1 3317    200 332846726    0.703480    1A 332846726  50
## 2   3 1693    200 170513271    0.654870    1B 170513271  50
## 3   4  711    200  71128502    0.744405    1B 509848656  50
## 4   7 1720    200 172103913    0.591699    2A 172103913  50
## 5   8  609    200  60911283    0.610511    2A 523287456  50
## 6   9 1127    200 113039313    0.821076    2B 113039313  50
### step4b: 获取top1% 的值,即水平截距
threshod <- df_sorted[top*nrow(df_sorted),8]
 print(paste(threshod," threshod ", top))
## [1] "25.1736403467324  threshod  0.05"
# write.table(d,file="/Users/Aoyue/Documents/value.txt",quote=F,col.name=T,row.names=F,sep = "\t") 
### step5: 建立累积坐标系统和x轴标签位置
don <- df %>%
  # Compute chromosome size
  group_by(CHROM) %>%
  summarise(chr_len=max(BIN_START)) %>%  ##每条染色体的最大值赋值给 chr_len
  # Calculate cumulative position of each chromosome
  mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>%  ##所有之和减去该行的染色体对应的长度,如:1B起始为1A长度,1D起始为1A 1B之和的长度
  select(-chr_len) %>% ##去除 chr_len这一列
  # Add this info to the initial dataset
  left_join(df, ., by=c("CHROM"="CHROM")) %>%  ## 将原数据框和don数据框进行合并
  # Add a cumulative position of each SNP
  arrange(CHROM, BIN_START) %>% ##根据CHROM BIN_START 进行排序
  mutate( BPcum=BIN_START+tot)  ##增加新的一列,改变每行点的位置

### 记录增量
dfadd <- df %>%
  # Compute chromosome size
  group_by(CHROM) %>%
  summarise(chr_len=max(BIN_START)) %>%  ##每条染色体的最大值赋值给 chr_len
  # Calculate cumulative position of each chromosome
  mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>%  ##所有之和减去该行的染色体对应的长度,如:1B起始为1A长度,1D起始为1A 1B之和的长度
  select(-chr_len)  ##去除 chr_len这一列
dfadd  
## # A tibble: 14 × 2
##    CHROM        tot
##    <chr>      <dbl>
##  1 1A             0
##  2 1B     589322893
##  3 2A    1272771549
##  4 2B    2043759005
##  5 3A    2844781695
##  6 3B    3595305153
##  7 4A    4421263148
##  8 4B    5164818687
##  9 5A    5838260571
## 10 5B    6541414897
## 11 6A    7254289667
## 12 6B    7864636897
## 13 7A    8575918310
## 14 7B    9312369770
# don
# write.table(don,file="/Users/Aoyue/Documents/value.txt",quote=F,col.name=T,row.names=F,sep = "\t")

## x轴标签位置:求出每条染色体在坐标轴上的中间位置
axisdf = don %>% group_by(CHROM) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )

### 建立函数关系,添加驯化信号的位点
### 建立函数关系,添加驯化信号的位点
### 建立函数关系,添加驯化信号的位点
geneList <- read.table("data/020_xpclr/gene.txt",sep = "\t",header = T)
geneList
##      Gene Chromosome     Start       End Strand  Length
## 1       Q         5A 650127221 650130900     -1    3679
## 2      Q2         5B 658092362 658095144     -1    2782
## 3      Q3         5D 521712714 521716500     -1    3786
## 4      Tg         2B  36446325  36446572     -1     247
## 5  Btr-A1         3A  65701207  65869644      1  168437
## 6  Btr-B1         3B  85239993  88971836      1 3731843
## 7  Btr-D1         3D  55350411  55779958      1  429547
## 8  Btr-A2         3A  65700170  65829460      1  129290
## 9  Btr-B2         3B  85237203  88202545      1 2965342
## 10 Btr-D2         3D  55349356  56444751      1 1095395
## 11 Vrn-A1         5A 587411824 587423240     -1   11416
## 12 Vrn-B1         5B 573803238 573815903     -1   12665
## 13 Vrn-D1         5D 467176964 467183469     -1    6505
## 14 Vrn-A2         5A 698178738 698180607     -1    1869
## 15 Vrn-B2         4B 657515589 657517678      1    2089
## 16 Vrn-D2         4D 509338928 509340956     -1    2028
## 17 Vrn-A3         7A  71669854  71670886      1    1032
## 18 Vrn-B3         7B   9702478   9703735      1    1257
## 19 Vrn-D3         7D  68416507  68417532     -1    1025
## 20 Ppd-A1         2A  36936874  36938065     -1    1191
## 21 Ppd-D1         2D  33952488  33955629     -1    3141
## 22 Rht-A1         4A 582477716 582479578     -1    1862
## 23 Rht-B1         4B  30861382  30863282      1    1900
## 24 Rht-D1         4D  18781062  18782933      1    1871
geneList$Gene <- as.vector(geneList$Gene)
geneList$Chromosome <- as.vector(geneList$Chromosome)
geneList$Start <- as.numeric(geneList$Start)  ## 为了可以进行预测
geneList$End <- as.numeric(geneList$End)

### 写一个函数,返回基因的标签,以及在图上的range位置
aoGetGenepos <- function(geneRow){
  gene <- geneList[geneRow,] ### get gene's dataframe, only has one line
  labelgene <- gene[1,1]### get gene's label
  chrGene <- as.vector(gene[1,2]) ### get gene's chr
  posSignal <- gene[,3:4] ### get gene's pos, a dataframe
  posOnchr <- c(posSignal[1,1],posSignal[1,2]) ### change to x 
  ### select chr, only one line
  dfadd_one <- dfadd %>% 
    filter(CHROM == chrGene) 
  # dfadd_one
  posSignalCum <- posOnchr + as.numeric(dfadd_one[1,2])
  # geneSummary <- list( labelgene,as.numeric(posSignalCum[1]),as.numeric(posSignalCum[2]))
  geneSummary <- list( labelgene,posSignalCum[1],posSignalCum[2])
  
  return(geneSummary)
}
 
#### btr1 
q <- aoGetGenepos(1)
q2 <- aoGetGenepos(2)
tg <- aoGetGenepos(4)

### step6: 颜色设置并画图
### 开始作图

### sample
don <- sample_frac(don,0.5)
p <- ggplot(don, aes(x=BPcum, y=Ave)) +  ##新的横坐标,Y轴的XPCLR值不变
  # Show all points
  # geom_point( aes(color=as.factor(CHROM)), alpha=0.2, size=0.5) +  ## alpha=0.5 first test
  geom_line(aes(color=as.factor(CHROM)), alpha=0.9, size=0.2) +  ## alpha=0.5 first test
  annotate("text",x=Inf,y= threshod ,hjust= 2,vjust= -0.3,label="top 5%", size = 3)+
  geom_hline(yintercept = threshod, linetype= "dashed", color = "gray",size=0.2)+
  annotate("text",x=0,y= Inf ,hjust= -0.2,vjust= 1.1,label= labelB, size = 2)+ ## add group annotation
  scale_color_manual(values = rep(colB, 14 )) + ##合计42条染色体。故21*2=42
  # custom X axis: 每条染色体对应一个中间位置,完美!
  scale_x_continuous(expand = c(0,0), label = axisdf$CHROM, breaks= axisdf$center) +
  scale_y_continuous(expand = c(0,1),limits = c(-6,54)) +
  # scale_y_continuous(expand = c(0, 0),limits = c(-1,52) ) +     # remove space between plot area and x axis
  
  ## add arrow to represent the selection signal 
  # geom_vline(xintercept = 5838260571,color="black",size=0.2)+
  # geom_segment(aes(x=6490914897,xend = 6490914897,y=0,yend = Inf) , size=0.2, color="blue") +
  # annotate(geom = "segment",x=6490914897 ,xend = 6490914897,y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") + 
  # annotate(geom = "text",x = 6490914897,y=-3,label= unlist(q[1]),fontface = 'italic',hjust = -0.4,size=1.2,color="blue") +
  # 
  # geom_segment(aes(x=7202289667 ,xend = 7202289667,y=0,yend = Inf) , size=0.2, color="blue") +
  # annotate(geom = "segment",x=7202289667,xend = 7202289667,y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  # annotate(geom = "text",x = 7202289667,y=-3,label=unlist(q2[1]),fontface = 'italic',hjust = -0.4,size=1.2,color="blue") +
  # # 
  # geom_segment(aes(x=2076798318 ,xend = 2076798318,y=0,yend = Inf) , size=0.2, color="blue") +
  # annotate(geom = "segment",x=2076798318,xend = 2080198318,y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  # annotate(geom = "text",x = 2080198318,y=-3,label=unlist(tg[1]),fontface = 'italic',hjust = -0.4,size=1.2,color="blue") +

  
  # geom_segment(aes(x=unlist(q[2]) ,xend = unlist(q[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(q[2]) ,xend = unlist(q[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") + 
  annotate(geom = "text",x = unlist(q[3]),y=-3,label= unlist(q[1]),fontface = 'italic',hjust = -0.4,size=1.4,color="blue") +
  
  # geom_segment(aes(x=unlist(q2[2]) ,xend = unlist(q2[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(q2[2]),xend = unlist(q2[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  annotate(geom = "text",x = unlist(q2[3]),y=-3,label=unlist(q2[1]),fontface = 'italic',hjust = -0.4,size=1.4,color="blue") +
  
  # geom_segment(aes(x=unlist(tg[2]) ,xend = unlist(tg[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(tg[2]),xend = unlist(tg[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  annotate(geom = "text",x = unlist(tg[3]),y=-3,label=unlist(tg[1]),fontface = 'italic',hjust = -0.4,size=1.4,color="blue") +
  
   # Custom the theme:
  xlab("")+ylab("XP-CLR")+
  theme(plot.margin=unit(rep(1,4),'lines'))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(size=0.4, colour = "black"),text = element_text(size = 9),legend.position = 'none')

p

# ggsave(p,filename = "/Users/Aoyue/Documents/test.png",width = 7,height = 7*3/16)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",width = 7,height = 7*3/16)

1.16 LR vs. Cultivar manhattan plot for add signal dot

################################# ##
#                               #
#  XPCLR genome line -- manhattan 100kb 50 kb 
#                               #
################################# ##
library(tidyverse)

# ### log2 snp method whole-genome SNP 3%
dataFile <- c("data/020_xpclr/ab_WE_DE_log2_0.0001_200_100000.clrgs.txt.gz")
df_wede <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

dataFile <- c("data/020_xpclr/ab_DE_Durum_log2_0.0001_200_100000.clrgs.txt.gz")
df_dedm <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

dataFile <- c("data/020_xpclr/abd_log2_0.0001_200_100000.clrgs.txt.gz")
df_lrcl <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

# df <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

### ******************************************************************************
### choice: 1.df 2.color 3.threshold  
### ******************************************************************************
### 1. data
# df <- df_wede
# df <- df_dedm
df <- df_lrcl
### 2.color
# test <- "wede"
# test <- "dedm"
test <- "lrcl"
colB <- NULL
labelB <- NULL

# colB1 <- c("#ffd702","#7f5701") ### we de
# colB2 <- c("#7f5701","#016699") ### de durum
# colB3 <- c("#fc6e6e","#9900ff")  ### lr cl
## use subgenome color
colB1 <- c("#fd8582","#967bce") ### we de
colB2 <- c("#fd8582","#967bce") ### de durum
colB3 <- c("#fd8582","#967bce","#4bcdc6")  ### lr cl

### if else failed to use
if(test == "wede"){
  colB <- colB1
  labelB <- "WE vs. DE"
} 
if(test == "dedm"){
  colB <- colB2
  labelB <- "DE vs. Durum"
}
if(test == "lrcl"){
  colB <- colB3
  labelB <- "Landrace vs. Cultivar"
}

### 3. 
# top <- 0.01
top <- 0.05

# head(df)
# nrow(df)
### step2: 修改列名,使之固定
names(df)[names(df)=="XPCLR_100score"] = "Ave"
names(df)[names(df)=="CHROM"] = "Chr"
names(df)[names(df)=="ChrRef"] = "CHROM"
names(df)[names(df)=="PosRef"] = "BIN_START"
max(df$Ave)
## [1] 50
min(df$Ave)
## [1] 0
### step3: 去除NA和无限小的值
df <- dplyr::filter(df,!is.na(Ave) & is.finite(Ave))
nrow(df)
## [1] 138616
head(df)
##   Chr Grid N_SNPs     POS Genetic_pos CHROM BIN_START       Ave
## 1   1    0     19 1159868    0.051648    1A   1159868 0.0000000
## 2   1    1      0 1259868    0.069976    1A   1259868 0.0000000
## 3   1    2      1 1359868    0.071034    1A   1359868 0.2222032
## 4   1    3      0 1459868    0.073261    1A   1459868 0.0000000
## 5   1    4      0 1559868    0.075514    1A   1559868 0.0000000
## 6   1    5      0 1659868    0.077767    1A   1659868 0.0000000
max(df$Ave) # 48.50477
## [1] 50
min(df$Ave) # 0
## [1] 0
### step4: 获取TOP K 截距
### step4a: 排序
df_sorted <- df %>% 
  dplyr::arrange(-Ave) 
head(df_sorted)
##   Chr Grid N_SNPs       POS Genetic_pos CHROM BIN_START Ave
## 1   2  660    136  66018349    1.038456    1A 537322354  50
## 2   4   14    200   1428502    0.629854    1B 440148656  50
## 3   5 1836    200 183678082    0.625617    1D 183678082  50
## 4   6    0     50     31612    1.029658    1D 452211216  50
## 5   6    1     50    131612    1.029658    1D 452311216  50
## 6   6    2     50    231612    1.029658    1D 452411216  50
### step4b: 获取top1% 的值,即水平截距
threshod <- df_sorted[top*nrow(df_sorted),8]
 print(paste(threshod," threshod ", top))
## [1] "11.6460765420766  threshod  0.05"
# write.table(d,file="/Users/Aoyue/Documents/value.txt",quote=F,col.name=T,row.names=F,sep = "\t") 
### step5: 建立累积坐标系统和x轴标签位置
don <- df %>%
  # Compute chromosome size
  group_by(CHROM) %>%
  summarise(chr_len=max(BIN_START)) %>%  ##每条染色体的最大值赋值给 chr_len
  # Calculate cumulative position of each chromosome
  mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>%  ##所有之和减去该行的染色体对应的长度,如:1B起始为1A长度,1D起始为1A 1B之和的长度
  select(-chr_len) %>% ##去除 chr_len这一列
  # Add this info to the initial dataset
  left_join(df, ., by=c("CHROM"="CHROM")) %>%  ## 将原数据框和don数据框进行合并
  # Add a cumulative position of each SNP
  arrange(CHROM, BIN_START) %>% ##根据CHROM BIN_START 进行排序
  mutate( BPcum=BIN_START+tot)  ##增加新的一列,改变每行点的位置

### 记录增量
dfadd <- df %>%
  # Compute chromosome size
  group_by(CHROM) %>%
  summarise(chr_len=max(BIN_START)) %>%  ##每条染色体的最大值赋值给 chr_len
  # Calculate cumulative position of each chromosome
  mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>%  ##所有之和减去该行的染色体对应的长度,如:1B起始为1A长度,1D起始为1A 1B之和的长度
  select(-chr_len)  ##去除 chr_len这一列
dfadd  
## # A tibble: 21 × 2
##    CHROM        tot
##    <chr>      <dbl>
##  1 1A             0
##  2 1B     590222354
##  3 1D    1275371010
##  4 2A    1767982226
##  5 2B    2542868826
##  6 2D    3343894519
##  7 3A    3990270072
##  8 3B    4740698622
##  9 3D    5564155979
## 10 4A    6179505869
## # … with 11 more rows
# don
# write.table(don,file="/Users/Aoyue/Documents/value.txt",quote=F,col.name=T,row.names=F,sep = "\t")

## x轴标签位置:求出每条染色体在坐标轴上的中间位置
axisdf = don %>% group_by(CHROM) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )

### 建立函数关系,添加驯化信号的位点
### 建立函数关系,添加驯化信号的位点
### 建立函数关系,添加驯化信号的位点
geneList <- read.table("data/020_xpclr/gene.txt",sep = "\t",header = T)
geneList
##      Gene Chromosome     Start       End Strand  Length
## 1       Q         5A 650127221 650130900     -1    3679
## 2      Q2         5B 658092362 658095144     -1    2782
## 3      Q3         5D 521712714 521716500     -1    3786
## 4      Tg         2B  36446325  36446572     -1     247
## 5  Btr-A1         3A  65701207  65869644      1  168437
## 6  Btr-B1         3B  85239993  88971836      1 3731843
## 7  Btr-D1         3D  55350411  55779958      1  429547
## 8  Btr-A2         3A  65700170  65829460      1  129290
## 9  Btr-B2         3B  85237203  88202545      1 2965342
## 10 Btr-D2         3D  55349356  56444751      1 1095395
## 11 Vrn-A1         5A 587411824 587423240     -1   11416
## 12 Vrn-B1         5B 573803238 573815903     -1   12665
## 13 Vrn-D1         5D 467176964 467183469     -1    6505
## 14 Vrn-A2         5A 698178738 698180607     -1    1869
## 15 Vrn-B2         4B 657515589 657517678      1    2089
## 16 Vrn-D2         4D 509338928 509340956     -1    2028
## 17 Vrn-A3         7A  71669854  71670886      1    1032
## 18 Vrn-B3         7B   9702478   9703735      1    1257
## 19 Vrn-D3         7D  68416507  68417532     -1    1025
## 20 Ppd-A1         2A  36936874  36938065     -1    1191
## 21 Ppd-D1         2D  33952488  33955629     -1    3141
## 22 Rht-A1         4A 582477716 582479578     -1    1862
## 23 Rht-B1         4B  30861382  30863282      1    1900
## 24 Rht-D1         4D  18781062  18782933      1    1871
geneList$Gene <- as.vector(geneList$Gene)
geneList$Chromosome <- as.vector(geneList$Chromosome)
geneList$Start <- as.numeric(geneList$Start)  ## 为了可以进行预测
geneList$End <- as.numeric(geneList$End)

### 写一个函数,返回基因的标签,以及在图上的range位置
aoGetGenepos <- function(geneRow){
  gene <- geneList[geneRow,] ### get gene's dataframe, only has one line
  labelgene <- gene[1,1]### get gene's label
  chrGene <- as.vector(gene[1,2]) ### get gene's chr
  posSignal <- gene[,3:4] ### get gene's pos, a dataframe
  posOnchr <- c(posSignal[1,1],posSignal[1,2]) ### change to x 
  ### select chr, only one line
  dfadd_one <- dfadd %>% 
    filter(CHROM == chrGene) 
  # dfadd_one
  posSignalCum <- posOnchr + as.numeric(dfadd_one[1,2])
  # geneSummary <- list( labelgene,as.numeric(posSignalCum[1]),as.numeric(posSignalCum[2]))
  geneSummary <- list( labelgene,posSignalCum[1],posSignalCum[2])
  
  return(geneSummary)
}
 
#### btr1 
q <- aoGetGenepos(11)
q2 <- aoGetGenepos(12)
tg <- aoGetGenepos(13)
sig4 <- aoGetGenepos(20)
sig5 <- aoGetGenepos(21)
sig6 <- aoGetGenepos(22)
sig7 <- aoGetGenepos(23)
sig8 <- aoGetGenepos(24)


### step6: 颜色设置并画图
### 开始作图

### sample
# don <- sample_frac(don,0.1)
p <- ggplot(don, aes(x=BPcum, y=Ave)) +  ##新的横坐标,Y轴的XPCLR值不变
  # Show all points
  # geom_point( aes(color=as.factor(CHROM)), alpha=0.2, size=0.5) +  ## alpha=0.5 first test
  geom_line(aes(color=as.factor(CHROM)), alpha=0.9, size=0.2) +  ## alpha=0.5 first test
  annotate("text",x=Inf,y= threshod ,hjust= 2,vjust= -0.3,label="top 5%", size = 3)+
  geom_hline(yintercept = threshod, linetype= "dashed", color = "gray",size=0.2)+
  annotate("text",x=0,y= Inf ,hjust= -0.2,vjust= 1.1,label= labelB, size = 2)+ ## add group annotation
  scale_color_manual(values = rep(colB, 14 )) + ##合计42条染色体。故21*2=42
  # custom X axis: 每条染色体对应一个中间位置,完美!
  scale_x_continuous(expand = c(0,0), label = axisdf$CHROM, breaks= axisdf$center) +
  scale_y_continuous(expand = c(0,1),limits = c(-6,54)) +
  # scale_y_continuous(expand = c(0, 0),limits = c(-1,52)) +     # remove space between plot area and x axis
  
  ## add arrow to represent the selection signal 
  # geom_vline(xintercept = 5838260571,color="black",size=0.2)+
  #geom_segment(aes(x=unlist(q[2]) ,xend = unlist(q[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(q[2]) ,xend = unlist(q[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") + 
  annotate(geom = "text",x = unlist(q[3]),y=-3,label= unlist(q[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
  
  #geom_segment(aes(x=unlist(q2[2]) ,xend = unlist(q2[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(q2[2]),xend = unlist(q2[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  annotate(geom = "text",x = unlist(q2[3]),y=-3,label=unlist(q2[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
  
  #geom_segment(aes(x=unlist(tg[2]) ,xend = unlist(tg[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(tg[2]),xend = unlist(tg[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  annotate(geom = "text",x = unlist(tg[3]),y=-3,label=unlist(tg[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
  
  # geom_segment(aes(x=unlist(sig4[2]) ,xend = unlist(sig4[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(sig4[2]),xend = unlist(sig4[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  annotate(geom = "text",x = unlist(sig4[3]),y=-3,label=unlist(sig4[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
  
  # geom_segment(aes(x=unlist(sig5[2]) ,xend = unlist(sig5[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(sig5[2]),xend = unlist(sig5[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  annotate(geom = "text",x = unlist(sig5[3]),y=-3,label=unlist(sig5[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
  
  # geom_segment(aes(x=unlist(sig6[2]) ,xend = unlist(sig6[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(sig6[2]),xend = unlist(sig6[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  annotate(geom = "text",x = unlist(sig6[3]),y=-3,label=unlist(sig6[1]),fontface = 'italic',hjust = 1.1,size=1.4,color="blue") +
  
  # geom_segment(aes(x=unlist(sig7[2]) ,xend = unlist(sig7[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(sig7[2]),xend = unlist(sig7[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  annotate(geom = "text",x = unlist(sig7[3]),y=-3,label=unlist(sig7[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
  
  # geom_segment(aes(x=unlist(sig8[2]) ,xend = unlist(sig8[2]),y=0,yend = Inf) , size=0.2, color="blue") +
  annotate(geom = "segment",x=unlist(sig8[2]),xend = unlist(sig8[3]),y=-5,yend = 0, arrow=arrow(length = unit(0.08, "cm"), type = "closed"), size=0.1, color="blue") +
  annotate(geom = "text",x = unlist(sig8[3]),y=-3,label=unlist(sig8[1]),fontface = 'italic',hjust = -0.1,size=1.4,color="blue") +
  
   # Custom the theme:
  xlab("")+ylab("XP-CLR")+
  theme(plot.margin=unit(rep(1,4),'lines'))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(size=0.4, colour = "black"),text = element_text(size = 9),legend.position = 'none')

p

# ggsave(p,filename = "/Users/Aoyue/Documents/test.png",width = 7,height = 7*3/16)

ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",width = 7,height = 7*3/16)

1.16.1 Top 5%

# ### log2 snp method whole-genome SNP 3%


dataFile <- c("data/020_xpclr/ab_WE_DE_log2_0.0001_200_100000.clrgs.txt.gz")
df_wede <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

### 找到 top5 的值域是多少,并且输出bins数目  
df <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dftest <- df %>% slice_max(XPCLR_100score,prop = 0.05)
dffinalline <- dftest %>% slice(n())
regions <- nrow(dftest)
print(paste("Threshod is",round(dffinalline[1,8],3) ,"Regions total",regions))
## [1] "Threshod is 25.282 Regions total 4988"
dataFile <- c("data/020_xpclr/ab_DE_Durum_log2_0.0001_200_100000.clrgs.txt.gz")
df_dedm <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

### 找到 top5 的值域是多少,并且输出bins数目  
df <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dftest <- df %>% slice_max(XPCLR_100score,prop = 0.05)
dffinalline <- dftest %>% slice(n())
regions <- nrow(dftest)
print(paste("Threshod is",round(dffinalline[1,8],3) ,"Regions total",regions))
## [1] "Threshod is 25.174 Regions total 4986"
dataFile <- c("data/020_xpclr/abd_log2_0.0001_200_100000.clrgs.txt.gz")
df_lrcl <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)

### 找到 top5 的值域是多少,并且输出bins数目  
df <- read.table(dataFile, header=TRUE, sep="\t",row.names=NULL)
dftest <- df %>% slice_max(XPCLR_100score,prop = 0.05)
dffinalline <- dftest %>% slice(n())
regions <- nrow(dftest)
print(paste("Threshod is",round(dffinalline[1,8],3) ,"Regions total",regions))
## [1] "Threshod is 11.646 Regions total 6930"

1.16.2 Top 5% 和 Top 1% 求值域;求受选择区域的CDS长度及占比

  • a:ind top K and its threshod
  • b:Get slective regions num
  • c:Get proportion of whole-genome region gene region and cds region
  • d:Get gene list and GO analysis

Table: Group TopK Threshod SelectiveSweep GeneNum Len_wholegenome Len_Gene Len_CDS Prop_whole Prop_Gene Prop_CDS

1.16.2.1 Get nonoverlap trans

# dataFile <- c("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/004_annoDB/001_geneTable/wheat_v1.1_nonoverlap_addPos.txt.gz")
# df <- read.delim(dataFile) %>% 
#   filter(Is_Unique_gene.1.0. == 1) %>% 
#   select(Longest_transcript)
# 
# write_tsv(df,file = "/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/006_nonoverlapGene/nonoverlapGene.txt")

1.16.2.2 函数构建及运行

AoGetXPCLRsummary <- function(Group,TopK,bin,dataFile){
  library(bedtoolsr)
  # bin <- 100000
  ### 注意不加 chl mt and chrUn
  genomeSize_aabbdd <- 14066280851
  ### genomeSize_aabb <- 9652829943
  ### genomeSize_dd <- 3951074735 
  ### 注意不加 chr0
  geneSize <- 370393700
  cdsSize <- 130348274
  
  ### @para1 Group
  # Group <- "wede"
  ### @para2 TopK
  # TopK <- top1
  
  # ### log2 snp method whole-genome SNP 3%
  ### 找到 topK 的值域是多少,并且输出bins数目
  df <- read.delim(dataFile) %>% slice_max(XPCLR_100score, prop = TopK)
  dffinalline <- df %>% slice(n()) ### 最后一行的内容,为了找到值域的确切值
  ### @para3 Threshod
  Threshod <- round(dffinalline[1, 8], 3)
  ### @para4 SelectiveSweep
  SelectiveSweep <- nrow(df)
  
  print(paste("Threshod is", Threshod, "Regions total", SelectiveSweep))
  
  ### ============================================================
  ### step1:make bed
  dfquery <- df %>% select(CHROM, POS) %>% mutate(POS_end = POS + bin)
  dfgff3gene <-
    read.delim(
      "/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/007_bed_onWholeGenome/GFF3_gene_nonoverlap.bed"
    )
  dfgff3cds <-
    read.delim(
      "/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/007_bed_onWholeGenome/GFF3_CDS_nonoverlap.bed"
    )
  ### step2:calculate the length on whole genome, gene region, and cds region
  ### a::genomesize AABBDD : 14066280851 AABB : 9652829943  DD : 3951074735  do not add chl and mt
  dfwhole <- dfquery %>% mutate(Length = POS_end - POS) %>%
    summarise(sum_whole = sum(Length)) %>%
    mutate(prop_whole = sum_whole / genomeSize_aabbdd)
  
  dataFile <-
    c(
      "/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/000_chrDB/chrConvertionRule_byAoyue.txt"
    )
  dfchrDB <- read.delim(dataFile) %>% select(CHROM = ChrID, Subgenome)
  
  dfsub <- dfquery %>% mutate(Length = POS_end - POS) %>%
    left_join(.,dfchrDB,by="CHROM") %>% 
    group_by(Subgenome) %>% 
    summarise(sum_whole = sum(Length)) %>%
    mutate(prop_whole = sum_whole / genomeSize_aabbdd) %>% 
    mutate(Group = Group, TopK = TopK)
  
  ### b::目的:保留选择区和 gene 的交集,并且得到交集所在的基因
  dfintersect_gene <-
    bt.intersect(dfgff3gene, dfquery) %>% arrange(V1, V2) %>% mutate(Subtract = V3 - V2)
  ### c::目的:保留选择区和 cds 的交集,并且得到交集所在的基因
  dfintersect_cds <-
    bt.intersect(dfgff3cds, dfquery) %>% arrange(V1, V2) %>% mutate(Subtract = V3 - V2)
  
  ### @para5 GeneNum
  dfgeneSelected <- dfintersect_gene %>% group_by(V4) %>% count() %>% distinct(., V4) %>% mutate(Group = Group, TopK = TopK)

  GeneNum <- dfintersect_gene %>% group_by(V4) %>% count() %>% distinct(., V4) %>% nrow()
  ### @para6 Len_wholegenome
  Len_wholegenome <- dfwhole$sum_whole
  ### @para7 Len_Gene
  Len_Gene <- sum(dfintersect_gene$Subtract)
  ### @para8 Len_CDS
  Len_CDS <- sum(dfintersect_cds$Subtract)
  ### @para9 Prop_whole
  Prop_whole <- dfwhole$prop_whole
  ### @para10 Prop_Gene
  Prop_Gene <- Len_Gene / geneSize
  ### @para11 Prop_CDS
  Prop_CDS <- Len_CDS / cdsSize
  
  dffinal <-
    data.frame(
      Group = Group,
      TopK = TopK,
      Threshod = Threshod,
      SelectiveSweep = SelectiveSweep,
      GeneNum = GeneNum,
      Len_wholegenome = Len_wholegenome,
      Len_Gene = Len_Gene,
      Len_CDS = Len_CDS,
      Prop_whole = Prop_whole,
      Prop_Gene = Prop_Gene,
      Prop_CDS = Prop_CDS
    )
  
  dfout <- list(dffinal,dfsub,dfgeneSelected)  ### 测试失败
  # return(dffinal)
  # return(dfgeneSelected)
  # return(dfsub)
  return(dfout)
}


dataFile1 <-c("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/004_output/ab_WE_DE_log2_0.0001_600_100000.clrgs.txt.gz")
dataFile2 <- c("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/004_output/ab_DE_Durum_log2_0.0001_600_100000.clrgs.txt.gz")
dataFile3 <- c("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/004_output/abd_log2_0.0001_600_100000.clrgs.txt.gz")
 
Group <- c("wede","wede","dedurum","dedurum","lrcl","lrcl")
TopK <- c(0.05,0.01,0.05,0.01,0.05,0.01)
bin <- rep(100000,6)
dataFile <- c(dataFile1,dataFile1,dataFile2,dataFile2,dataFile3,dataFile3) 
dfpara <- data.frame(Group=Group,TopK=TopK,bin=bin,dataFile=dataFile,stringsAsFactors = F)

dfm <- dfpara %>%
  # group_modify(~{.x %>% AoGetXPCLRsummary()})  %>% 
  rowwise() %>% 
  summarise(sum = list(AoGetXPCLRsummary(Group,TopK,bin,dataFile))) %>% 
  unnest()
## Warning in fun(libname, pkgname): bedtoolsr was built with bedtools version 2.30.0 but you have version 2.29.2 installed. Function syntax may have changed and wrapper will not function correctly. To fix this, please install bedtools version 2.30.0 and either add it to your PATH or run:
## options(bedtools.path = \"[bedtools path]\")
## [1] "Threshod is 34.584 Regions total 4992"
## [1] "Threshod is 44.076 Regions total 998"
## [1] "Threshod is 30.094 Regions total 5002"
## [1] "Threshod is 43.019 Regions total 1000"
## [1] "Threshod is 17.012 Regions total 6941"
## [1] "Threshod is 46.155 Regions total 1388"
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(sum)`
### output   
# filename <- c("~/Documents/test.txt")
# write_tsv(dfm,file = filename)

1.16.2.3 单个文件测试

Group <- dfpara$Group[1]
TopK <- dfpara$TopK[1]
bin <- dfpara$bin[1]
dataFile <- dfpara$dataFile[1]

library(bedtoolsr)
  # bin <- 100000
  ### 注意不加 chl mt and chrUn
  genomeSize_aabbdd <- 14066280851
  ### genomeSize_aabb <- 9652829943
  ### genomeSize_dd <- 3951074735 
  ### 注意不加 chr0
  geneSize <- 370393700
  cdsSize <- 130348274
  
  ### @para1 Group
  # Group <- "wede"
  ### @para2 TopK
  # TopK <- top1
  
  # ### log2 snp method whole-genome SNP 3%
  ### 找到 topK 的值域是多少,并且输出bins数目
  df <- read.delim(dataFile) %>% slice_max(XPCLR_100score, prop = TopK)
  dffinalline <- df %>% slice(n()) ### 最后一行的内容,为了找到值域的确切值
  ### @para3 Threshod
  Threshod <- round(dffinalline[1, 8], 3)
  ### @para4 SelectiveSweep
  SelectiveSweep <- nrow(df)
  
  print(paste("Threshod is", Threshod, "Regions total", SelectiveSweep))
## [1] "Threshod is 34.584 Regions total 4992"
  ### ============================================================
  ### step1:make bed
  dfquery <- df %>% select(CHROM, POS) %>% mutate(POS_end = POS + bin)
  dfgff3gene <-
    read.delim(
      "/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/007_bed_onWholeGenome/GFF3_gene_nonoverlap.bed"
    )
  dfgff3cds <-
    read.delim(
      "/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/009_xpclr/007_bed_onWholeGenome/GFF3_CDS_nonoverlap.bed"
    )
  ### step2:calculate the length on whole genome, gene region, and cds region
  ### a::genomesize AABBDD : 14066280851 AABB : 9652829943  DD : 3951074735  do not add chl and mt
  dfwhole <- dfquery %>% mutate(Length = POS_end - POS) %>%
    summarise(sum_whole = sum(Length)) %>%
    mutate(prop_whole = sum_whole / genomeSize_aabbdd)
  
  dataFile <-
    c(
      "/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/000_chrDB/chrConvertionRule_byAoyue.txt"
    )
  dfchrDB <- read.delim(dataFile) %>% select(CHROM = ChrID, Subgenome)
  
  dfsub <- dfquery %>% mutate(Length = POS_end - POS) %>%
    left_join(.,dfchrDB,by="CHROM") %>% 
    group_by(Subgenome) %>% 
    summarise(sum_whole = sum(Length)) %>%
    mutate(prop_whole = sum_whole / genomeSize_aabbdd)
  
  ### b::目的:保留选择区和 gene 的交集,并且得到交集所在的基因
  dfintersect_gene <-
    bt.intersect(dfgff3gene, dfquery) %>% arrange(V1, V2) %>% mutate(Subtract = V3 - V2)
  
  dfintersect_gene_bySub <- dfintersect_gene %>% 
    mutate()
  
  ### c::目的:保留选择区和 cds 的交集,并且得到交集所在的基因
  dfintersect_cds <-
    bt.intersect(dfgff3cds, dfquery) %>% arrange(V1, V2) %>% mutate(Subtract = V3 - V2)
  
  ### @para5 GeneNum
  dfgeneSelected <- dfintersect_gene %>% group_by(V4) %>% count() %>% distinct(., V4) %>% mutate(Group = Group, TopK = TopK)

  GeneNum <- dfintersect_gene %>% group_by(V4) %>% count() %>% distinct(., V4) %>% nrow()
  ### @para6 Len_wholegenome
  Len_wholegenome <- dfwhole$sum_whole
  ### @para7 Len_Gene
  Len_Gene <- sum(dfintersect_gene$Subtract)
  ### @para8 Len_CDS
  Len_CDS <- sum(dfintersect_cds$Subtract)
  ### @para9 Prop_whole
  Prop_whole <- dfwhole$prop_whole
  ### @para10 Prop_Gene
  Prop_Gene <- Len_Gene / geneSize
  ### @para11 Prop_CDS
  Prop_CDS <- Len_CDS / cdsSize
  
  dffinal <-
    data.frame(
      Group = Group,
      TopK = TopK,
      Threshod = Threshod,
      SelectiveSweep = SelectiveSweep,
      GeneNum = GeneNum,
      Len_wholegenome = Len_wholegenome,
      Len_Gene = Len_Gene,
      Len_CDS = Len_CDS,
      Prop_whole = Prop_whole,
      Prop_Gene = Prop_Gene,
      Prop_CDS = Prop_CDS
    )